# Libraries used in this analysis
# I. universal data wrangling and visualization
library(tidyverse) # ggplot, dplyr, purrr, forcats, stringr, etc. See https://tidyverse.tidyverse.org/
library(tidyselect) # useful for subsetting dataframes
library(gtools)
# II. formatting, plotting
library(scales) # used in formatting relative abundance % labels; dollar_format()
library(knitr) # necessary for kable, for displaying nicer tables in html/pdf output
library(ggplot2) # customizable data visualization
library(ggpubr) # data visualization, built on ggplot2
# III. analysis packages
library(vegan) # decostand(), metaMDS(), scores(), rda(), ordispider(), adonis()
# IV. functions for automating and streamlining workflow
#library(devtools)
#devtools::install_github("https://github.com/cb-42/cbmbtools")
library(cbmbtools)
#V. Rmarkdown formatting
library(pander) # format session info
pander(sessionInfo())
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages: stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: pander(v.0.6.3), cbmbtools(v.0.0.0.9034), vegan(v.2.5-6), lattice(v.0.20-41), permute(v.0.9-5), ggpubr(v.0.4.0), knitr(v.1.29), scales(v.1.1.1), gtools(v.3.8.2), tidyselect(v.1.1.0), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.1), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.1), tibble(v.3.0.3), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)
loaded via a namespace (and not attached): httr(v.1.4.2), jsonlite(v.1.7.0), splines(v.4.0.2), carData(v.3.0-4), modelr(v.0.1.8), assertthat(v.0.2.1), blob(v.1.2.1), cellranger(v.1.1.0), yaml(v.2.2.1), pillar(v.1.4.6), backports(v.1.1.8), glue(v.1.4.1), digest(v.0.6.25), ggsignif(v.0.6.0), rvest(v.0.3.6), colorspace(v.1.4-1), Matrix(v.1.2-18), htmltools(v.0.5.0), pkgconfig(v.2.0.3), broom(v.0.7.0), haven(v.2.3.1), openxlsx(v.4.1.5), rio(v.0.5.16), mgcv(v.1.8-31), generics(v.0.0.2), car(v.3.0-9), ellipsis(v.0.3.1), withr(v.2.2.0), cli(v.2.0.2), magrittr(v.1.5), crayon(v.1.3.4), readxl(v.1.3.1), evaluate(v.0.14), fs(v.1.5.0), fansi(v.0.4.1), nlme(v.3.1-148), MASS(v.7.3-51.6), rstatix(v.0.6.0), xml2(v.1.3.2), foreign(v.0.8-80), tools(v.4.0.2), data.table(v.1.13.0), hms(v.0.5.3), lifecycle(v.0.2.0), munsell(v.0.5.0), reprex(v.0.3.0), cluster(v.2.1.0), zip(v.2.0.4), compiler(v.4.0.2), rlang(v.0.4.7), grid(v.4.0.2), rstudioapi(v.0.11), rmarkdown(v.2.3), codetools(v.0.2-16), gtable(v.0.3.0), abind(v.1.4-5), DBI(v.1.1.0), curl(v.4.3), R6(v.2.4.1), lubridate(v.1.7.9), stringi(v.1.4.6), parallel(v.4.0.2), Rcpp(v.1.0.5), vctrs(v.0.3.2), dbplyr(v.1.4.4) and xfun(v.0.16)
# function for computing summary statistics - error bar helper
sem <- function(x){
sqrt(var(x, na.rm = TRUE)/length(na.omit(x)))
}
set.seed(3132)
#data prep
ddpcr <- ddpcr %>% group_by(Sample_Type)
ddpcr.summary <- summarize(ddpcr,
Mean=mean(Gene_16S_copies_per_mL),
SEM=sqrt(var(Gene_16S_copies_per_mL)/length(Gene_16S_copies_per_mL)))
ggplot(ddpcr, aes(x=Sample_Type)) +
geom_col(data=ddpcr.summary, aes(x=Sample_Type, y=Mean, fill=Sample_Type), color= "black", show.legend = FALSE) +
geom_point(aes(x=Sample_Type, y=Gene_16S_copies_per_mL)) +
geom_errorbar(data=ddpcr.summary, aes(ymin = Mean-SEM, ymax = Mean+SEM, width = 0.3)) +
theme_classic() +
scale_y_log10(
limits = c(10^0, 10^6),
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) +
theme(axis.text.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.title.y = element_text(size=12, face="bold", angle = 0, vjust=0.5),
axis.title.x = element_text(size=12, face="bold"),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) +
ylab("16S rRNA Gene\nCopies/mL") +
xlab("\nSample Type")
#overall difference between groups - kruskal wallis test for multiple group comparisons
kruskal.test(Gene_16S_copies_per_mL ~ Sample_Type, data=ddpcr)
#pairwise difference between groups - Wilcoxon rank sum test, adjusted for multiple comparisons
pairwise.wilcox.test(ddpcr$Gene_16S_copies_per_mL, ddpcr$Sample_Type, p.adjust.method = "BH")
Kruskal-Wallis rank sum test
data: Gene_16S_copies_per_mL by Sample_Type
Kruskal-Wallis chi-squared = 29.276, df = 6, p-value = 5.394e-05
Pairwise comparisons using Wilcoxon rank sum exact test
data: ddpcr$Gene_16S_copies_per_mL and ddpcr$Sample_Type
NTC Iso_Ctrl PBS Syringe_Rinse Homog_Ctrl BAL
Iso_Ctrl 0.64000 - - - - -
PBS 0.58947 0.40000 - - - -
Syringe_Rinse 0.42353 0.03333 0.40000 - - -
Homog_Ctrl 0.58947 0.16667 0.42353 0.40000 - -
BAL 0.22238 0.00839 0.31818 0.27082 0.90909 -
Lung 0.00839 0.00262 0.07955 0.00839 0.07955 0.00023
P value adjustment method: BH
#prepare data for plotting
reads.by.sample.type.df <- otu_df
reads.by.sample.type.df$Sample_Type2 <- factor(reads.by.sample.type.df$Sample_Type2,
levels = c("Mock", "Water", "Empty", "AE", "Iso_Ctrl", "PBS", "Syringe_Rinse", "Homog_Ctrl", "BAL", "Lung", "Tongue", "Cecum"))
#plot number of MiSeq reads by sample type
ggplot(reads.by.sample.type.df, aes(x=Sample_Type2, y=Reads.PF.Sample)) +
geom_boxplot(aes(fill = Sample_Type2), show.legend=FALSE) +
geom_point() +
theme_classic() +
scale_y_log10(
limits = c(10^0, 10^6),
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) +
theme(axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
axis.title.y = element_text(size=15, face="bold", angle = 0, vjust=0.5),
axis.title.x = element_text(size=15, face="bold"),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) +
ylab("MiSeq\nReads") +
xlab("\nSample Type")
# prepare data for plotting
otu.total.counts.byExpt <- bind_cols(otu_df[,c("Sample", "Sample_Type2", "Organ")], OTU_Count = rowSums(otu_good)) %>%
group_by(Sample_Type2)
otu.total.counts.byExpt$Sample_Type2 <- factor(otu.total.counts.byExpt$Sample_Type2,
levels = c("Mock", "Water", "Empty", "AE", "Iso_Ctrl", "PBS", "Syringe_Rinse", "Homog_Ctrl", "BAL", "Lung", "Tongue", "Cecum"))
# plot OTU counts by sample type
ggplot(otu.total.counts.byExpt, aes(x=Sample_Type2, y=OTU_Count)) +
geom_boxplot(aes(fill = Sample_Type2), show.legend=FALSE) +
geom_point() +
theme_classic() +
scale_y_log10(
limits = c(10^0, 10^6),
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) +
theme(axis.text.y = element_text(size=12),
axis.text.x = element_text(size=12),
axis.title.y = element_text(size=15, face="bold", angle = 0, vjust=0.5),
axis.title.x = element_text(size=15, face="bold"),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) +
ylab("OTU\nCount") +
xlab("\nSample Type")
#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.sampctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|PBS|CLEAN|USED|Homog")
#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.sampctrls.good <- !is.na(extract.lungsamp.sampctrls)
#subset otu counts
otu.good.lungsamp.sampctrls <- otu_good[extract.lungsamp.sampctrls.good, ]
#subset otu.df for PCA
otu.df.lungsamp.sampctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "PBS" | Sample_Type == "Clean_Rinse" | Sample_Type == "Used_Rinse" | Sample_Type == "Homog_Ctrl")
#make a hellinger normalized df and pca object from the normalized df
make_hel_pca(otu.good.lungsamp.sampctrls)
# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_sampctrls_pca_summary <- summary(otu.good.lungsamp.sampctrls_pca)
# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites
pc_space <- lungsamp_sampctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
pc_loadings <- lungsamp_sampctrls_pca_summary$species # species as loading vectors
# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.sampctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2)
# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>%
group_by(Sample_Type2) %>%
summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%
ungroup()
# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2")
# Now you can use your dataframe and mainpulate it how you want
pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2,
levels = c("Lung", "BAL", "PBS", "Syringe_Rinse", "Homog_Ctrl"),
labels = c("Lung", "BAL", "Saline", "Syringe Rinse", "Homogenization\nControl"))
pca_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#666B6E", #- saline
"#9A99E9", #- used rinse
"#F2C121" # saffron - water
)
# plot PCA
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") +
geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
# segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
theme_classic() +
theme(legend.title = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15),
axis.title.y = element_text(size=12, face="bold", vjust = 0),
axis.title.x = element_text(size=12, face="bold", vjust = 0),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
labs(
x = "\nPrincipal Component 1 (13.7% explained)",
y = "Principal Component 2 (7.0% explained)\n"
) + scale_color_manual(values=pca_pal)
#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.sampctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.sampctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>%
dplyr::select(Sample, Sample_Type, everything())
#multiple comparisons - whole lung v. BAL v. pooled sampling controls
adonis.hel.df.wbn <- column_to_rownames(adonis.hel.df, var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis(adonis.hel.df.wbn~otu.df.lungsamp.sampctrls$RA_Groups, method="euclidean", permutations=10000)
#pairwise comparisons - whole lung v. bal
adonis.hel.bl <- dplyr::filter(adonis.hel.df, Sample_Type == "Lung" | Sample_Type == "BAL") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.bl <- dplyr::filter(otu.df.lungsamp.sampctrls,
Sample_Type == "Lung" | Sample_Type == "BAL")
adonis(adonis.hel.bl~adonis.otudf.bl$Sample_Type, method="euclidean", permutations=10000)
#pairwise comparison - whole lung v. sampling negative controls
adonis.hel.wn <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.wn <- dplyr::filter(otu.df.lungsamp.sampctrls, Sample_Type != "BAL")
adonis(adonis.hel.wn~adonis.otudf.wn$RA_Groups, method="euclidean", permutations=10000)
#pairwise comparison - BAL v. sampling negative controls
adonis.hel.bn <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.bn <- dplyr::filter(otu.df.lungsamp.sampctrls, Sample_Type != "Lung")
adonis(adonis.hel.bn~adonis.otudf.bn$Organ, method="euclidean", permutations=10000)
Call:
adonis(formula = adonis.hel.df.wbn ~ otu.df.lungsamp.sampctrls$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2
otu.df.lungsamp.sampctrls$RA_Groups 2 3.8879 1.94394 2.3806 0.16554
Residuals 24 19.5980 0.81658 0.83446
Total 26 23.4859 1.00000
Pr(>F)
otu.df.lungsamp.sampctrls$RA_Groups 9.999e-05 ***
Residuals
Total
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.bl ~ adonis.otudf.bl$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.bl$Sample_Type 1 2.4484 2.4484 3.1334 0.15563 9.999e-05 ***
Residuals 17 13.2838 0.7814 0.84437
Total 18 15.7323 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.wn ~ adonis.otudf.wn$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.wn$RA_Groups 1 2.4698 2.46976 3.3064 0.18061 4e-04 ***
Residuals 15 11.2046 0.74697 0.81939
Total 16 13.6744 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.bn ~ adonis.otudf.bn$Organ, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.bn$Organ 1 0.9148 0.91483 0.99522 0.05856 0.463
Residuals 16 14.7076 0.91922 0.94144
Total 17 15.6224 1.00000
#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.isoctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|_IsoCtrl_|AE")
#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.isoctrls.good <- !is.na(extract.lungsamp.isoctrls)
#subset otu counts
otu.good.lungsamp.isoctrls <- otu_good[extract.lungsamp.isoctrls.good, ]
#subset otu.df for PCA
otu.df.lungsamp.isoctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "Iso_Ctrl" | Sample_Type == "AE")
#make a hellinger normalized df and pca object from the normalized df
make_hel_pca(otu.good.lungsamp.isoctrls)
# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_isoctrls_pca_summary <- summary(otu.good.lungsamp.isoctrls_pca)
# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites
pc_space <- lungsamp_isoctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
pc_loadings <- lungsamp_isoctrls_pca_summary$species # species as loading vectors
# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.isoctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2)
# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>%
group_by(Sample_Type2) %>%
summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%
ungroup()
# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2")
# Now you can use your dataframe and mainpulate it how you want
pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2,
levels = c("Lung", "BAL", "AE", "Iso_Ctrl"),
labels = c("Lung", "BAL", "Elution\nBuffer", "Isolation\nControl"))
pca_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#DBE6EC", #- AE
"#666B6E" #- iso ctrl
)
# plot PCA
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") +
geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
# segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
theme_classic() +
theme(legend.title = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15),
axis.title.y = element_text(size=12, face="bold", vjust = 0),
axis.title.x = element_text(size=12, face="bold", vjust = 0),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
) +
scale_color_manual(values = pca_pal) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
labs(
x = "\nPrincipal Component 1 (12.1% explained)",
y = "Principal Component 2 (9.0% explained)\n"
)
#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.isoctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.isoctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>%
dplyr::select(Sample, Sample_Type, everything())
#multiple comparisons - whole lung v. BAL v. pooled isolation controls
adonis(otu.good.lungsamp.isoctrls_hel~otu.df.lungsamp.isoctrls$Sample_Type,
method="euclidean", permutations=10000)
#pairwise comparison - whole lung v. iso ctrls
adonis.hel.il <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>%
column_to_rownames(var = "Sample") %>% dplyr::select(-Sample_Type)
adonis.otudf.il <- dplyr::filter(otu.df.lungsamp.isoctrls, Sample_Type != "BAL")
adonis(adonis.hel.il~adonis.otudf.il$RA_Groups, method="euclidean", permutations=10000)
#pairwise comparison - BAL v. iso ctrls
adonis.hel.ib <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>%
column_to_rownames(var = "Sample") %>% dplyr::select(-Sample_Type)
adonis.otudf.ib <- dplyr::filter(otu.df.lungsamp.isoctrls, Sample_Type != "Lung")
adonis(adonis.hel.ib~adonis.otudf.ib$RA_Groups, method="euclidean", permutations=10000)
Call:
adonis(formula = otu.good.lungsamp.isoctrls_hel ~ otu.df.lungsamp.isoctrls$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2
otu.df.lungsamp.isoctrls$Sample_Type 3 5.0693 1.6898 2.0376 0.1846
Residuals 27 22.3910 0.8293 0.8154
Total 30 27.4603 1.0000
Pr(>F)
otu.df.lungsamp.isoctrls$Sample_Type 9.999e-05 ***
Residuals
Total
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.il ~ adonis.otudf.il$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.il$RA_Groups 1 2.6804 2.68040 3.4196 0.15253 9.999e-05 ***
Residuals 19 14.8927 0.78383 0.84747
Total 20 17.5731 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.ib ~ adonis.otudf.ib$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.ib$RA_Groups 1 1.2087 1.20866 1.3141 0.06165 0.07149 .
Residuals 20 18.3957 0.91978 0.93835
Total 21 19.6044 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.seqctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|Water|Blank")
#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.seqctrls.good <- !is.na(extract.lungsamp.seqctrls)
#subset otu counts
otu.good.lungsamp.seqctrls <- otu_good[extract.lungsamp.seqctrls.good, ]
#subset otu.df for PCA
otu.df.lungsamp.seqctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "Water" | Sample_Type == "Empty")
#make a hellinger normalized df and pca object from the normalized df
make_hel_pca(otu.good.lungsamp.seqctrls)
# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_seqctrls_pca_summary <- summary(otu.good.lungsamp.seqctrls_pca)
# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites
pc_space <- lungsamp_seqctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
pc_loadings <- lungsamp_seqctrls_pca_summary$species # species as loading vectors
# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.seqctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2)
# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>%
group_by(Sample_Type2) %>%
summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%
ungroup()
# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2")
# Now you can use your dataframe and mainpulate it how you want
pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2,
levels = c("Lung", "BAL", "Empty", "Water"),
labels = c("Lung", "BAL", "Empty\nWell", "Water"))
pca_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#F2C121", #- Empty
"#666B6E" #- Water
)
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") +
geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
# segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
theme_classic() +
theme(legend.title = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15),
axis.title.y = element_text(size=12, face="bold", vjust = 0),
axis.title.x = element_text(size=12, face="bold", vjust = 0),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
) +
scale_color_manual(values = pca_pal) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
labs(
x = "\nPrincipal Component 1 (13.5% explained)",
y = "Principal Component 2 (6.6% explained)\n"
)
#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.seqctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.seqctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>%
dplyr::select(Sample, Sample_Type, everything())
#multiple comparisons - whole lung v. BAL v. pooled sequencing controls
adonis(otu.good.lungsamp.seqctrls_hel~otu.df.lungsamp.seqctrls$Sample_Type,
method="euclidean",
permutations=10000)
#pairwise comparison - whole lung v. seq ctrls
adonis.hel.sl <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.sl <- dplyr::filter(otu.df.lungsamp.seqctrls, Sample_Type != "BAL")
adonis(adonis.hel.sl~adonis.otudf.sl$RA_Groups, method="euclidean", permutations=10000)
#pairwise comparison - BAL v. seq ctrls
adonis.hel.sb <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.sb <- dplyr::filter(otu.df.lungsamp.seqctrls, Sample_Type != "Lung")
adonis(adonis.hel.sb~adonis.otudf.sb$RA_Groups, method="euclidean", permutations=10000)
Call:
adonis(formula = otu.good.lungsamp.seqctrls_hel ~ otu.df.lungsamp.seqctrls$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2
otu.df.lungsamp.seqctrls$Sample_Type 3 7.919 2.63959 3.2847 0.16193
Residuals 51 40.984 0.80361 0.83807
Total 54 48.903 1.00000
Pr(>F)
otu.df.lungsamp.seqctrls$Sample_Type 9.999e-05 ***
Residuals
Total
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.sl ~ adonis.otudf.sl$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.sl$RA_Groups 1 4.146 4.1459 5.1613 0.10717 9.999e-05 ***
Residuals 43 34.541 0.8033 0.89283
Total 44 38.687 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.sb ~ adonis.otudf.sb$RA_Groups, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.sb$RA_Groups 1 2.069 2.06931 2.3933 0.05159 4e-04 ***
Residuals 44 38.044 0.86463 0.94841
Total 45 40.113 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#extract samples of interest with specified strings - returns vector of strings and NAs
extract.tong.lungsamp <- str_extract(rownames(otu_good), pattern="Tong|_BAL_|_Lung_")
#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.tong.lungsamp.good <- !is.na(extract.tong.lungsamp)
#subset otu counts
otu.good.tong.lungsamp <- otu_good[extract.tong.lungsamp.good, ]
#subset otu.df for PCA
otu.df.tong.lungsamp <- filter(otu_df, Sample_Type == "Tongue" | Sample_Type == "BAL" | Sample_Type == "Lung")
otu.df.tong.lungsamp <- mutate(otu.df.tong.lungsamp, PCA_Groups = ifelse(
test = Organ == "Lung",
yes = as.character(RA_Groups),
no = as.character(Organ))) %>%
dplyr::select(Sample:Gene_16S_copies_per_mL, PCA_Groups, everything())
otu.df.tong.lungsamp$PCA_Groups <- as.factor(otu.df.tong.lungsamp$PCA_Groups)
#make a hellinger normalized df and pca object from the normalized df
make_hel_pca(otu.good.tong.lungsamp)
tong_lungsamp_pca_summary <- summary(otu.good.tong.lungsamp_pca) # summarize pca object to allow for pulling the coordinates in the PC space
# select PC1 and PC2 for plotting
#"sites" refers to each individual sample, i.e. ecologic sites
pc_space <- tong_lungsamp_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
pc_loadings <- tong_lungsamp_pca_summary$species # species as loading vectors
# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, PCA_Groups = otu.df.tong.lungsamp[,"PCA_Groups"], PC1 = pc_space$PC1, PC2 = pc_space$PC2)
# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>%
group_by(PCA_Groups) %>%
summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%
ungroup()
# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="PCA_Groups")
# Now you can use your dataframe and mainpulate it how you want
pc_plot_data$PCA_Groups <- factor(pc_plot_data$PCA_Groups,
levels = c("Lung", "BAL", "Tongue"))
pca_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#FB9986" #vivid tangerine - tongue
)
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") +
geom_point(show.legend = FALSE, aes(color = PCA_Groups), size = 2) +
# segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=PCA_Groups, color=PCA_Groups), size=3, show.legend = FALSE) + # centroid information
theme_classic() +
theme(legend.title = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15),
axis.title.y = element_text(size=12, face="bold", vjust = 0),
axis.title.x = element_text(size=12, face="bold", vjust = 0),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
) +
scale_color_manual(values = pca_pal) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
labs(
x = "\nPrincipal Component 1 (10.5% explained)",
y = "Principal Component 2 (4.6% explained)\n"
)
#multiple comparisons - whole lung v. BAL v. tongue
adonis(otu.good.tong.lungsamp_hel~otu.df.tong.lungsamp$Sample_Type,
method="euclidean",
permutations=10000)
#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.tong.lungsamp, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.tong.lungsamp_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>%
dplyr::select(Sample, Sample_Type, everything())
#pairwise comparison - whole lung v. tongue
adonis.hel.tl <- dplyr::filter(adonis.hel.df, Sample_Type == "Lung" | Sample_Type == "Tongue") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.tl <- dplyr::filter(otu.df.tong.lungsamp,
Sample_Type == "Lung" | Sample_Type == "Tongue")
adonis(adonis.hel.tl~adonis.otudf.tl$Sample_Type, method="euclidean", permutations=10000)
#pairwise comparison - BAL v. tongue
adonis.hel.tb <- dplyr::filter(adonis.hel.df, Sample_Type == "BAL" | Sample_Type == "Tongue") %>%
column_to_rownames(var = "Sample") %>%
dplyr::select(-Sample_Type)
adonis.otudf.tb <- dplyr::filter(otu.df.tong.lungsamp, Sample_Type == "BAL" | Sample_Type == "Tongue")
adonis(adonis.hel.tb~adonis.otudf.tb$Sample_Type, method="euclidean", permutations=10000)
Call:
adonis(formula = otu.good.tong.lungsamp_hel ~ otu.df.tong.lungsamp$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
otu.df.tong.lungsamp$Sample_Type 2 3.9663 1.98317 2.6355 0.12772 9.999e-05
Residuals 36 27.0894 0.75248 0.87228
Total 38 31.0557 1.00000
otu.df.tong.lungsamp$Sample_Type ***
Residuals
Total
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.tl ~ adonis.otudf.tl$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.tl$Sample_Type 1 1.039 1.03903 1.5005 0.05265 0.0142 *
Residuals 27 18.696 0.69244 0.94735
Total 28 19.735 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Call:
adonis(formula = adonis.hel.tb ~ adonis.otudf.tb$Sample_Type, permutations = 10000, method = "euclidean")
Permutation: free
Number of permutations: 10000
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
adonis.otudf.tb$Sample_Type 1 2.6012 2.60116 3.2809 0.10489 9.999e-05 ***
Residuals 28 22.1989 0.79282 0.89511
Total 29 24.8001 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#extract samples of interest with specified strings - returns vector of strings and NAs
extract.tong.lungsamp.allnegctrls <- str_extract(rownames(otu_good), pattern="Tong|_BAL_|_Lung_|PBS|CLEAN|USED|Homog|AE|Iso|Water|Blank")
#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.tong.lungsamp.allnegctrls.good <- !is.na(extract.tong.lungsamp.allnegctrls)
#subset otu counts
otu.good.tong.lungsamp.allnegctrls <- otu_good[extract.tong.lungsamp.allnegctrls.good, ]
#subset otu.df for PCA
otu.df.tong.lungsamp.allnegctrls <- filter(otu_df, Sample_Type == "Tongue" | Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "PBS" | Sample_Type == "Clean_Rinse" | Sample_Type == "Used_Rinse" | Sample_Type == "Homog_Ctrl" | Sample_Type == "AE" | Sample_Type == "Iso_Ctrl" | Sample_Type == "Water" | Sample_Type == "Empty")
otu.df.tong.lungsamp.allnegctrls <- mutate(otu.df.tong.lungsamp.allnegctrls, PCA_Groups = ifelse(
test = Organ == "Lung",
yes = as.character(RA_Groups),
no = as.character(Organ))) %>%
dplyr::select(Sample:Gene_16S_copies_per_mL, PCA_Groups, everything())
otu.df.tong.lungsamp.allnegctrls$PCA_Groups <- as.factor(otu.df.tong.lungsamp.allnegctrls$PCA_Groups)
#make a hellinger normalized df and pca object from the normalized df
make_hel_pca(otu.good.tong.lungsamp.allnegctrls)
# summarize pca object to allow for pulling the coordinates in the PC space
tong_lungsamp_relevnegctrls_pca_summary <- summary(otu.good.tong.lungsamp.allnegctrls_pca)
# select PC1 and PC2 for plotting
#"sites" refers to each individual sample, i.e. ecologic sites
pc_space <- tong_lungsamp_relevnegctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
pc_loadings <- tong_lungsamp_relevnegctrls_pca_summary$species # species as loading vectors
# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, PCA_Groups = otu.df.tong.lungsamp.allnegctrls[,"PCA_Groups"], PC1 = pc_space$PC1, PC2 = pc_space$PC2)
# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>%
group_by(PCA_Groups) %>%
summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%
ungroup()
# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="PCA_Groups")
# Now you can use your dataframe and mainpulate it how you want
pc_plot_data$PCA_Groups <- factor(pc_plot_data$PCA_Groups,
levels = c("Lung", "BAL", "Tongue", "Sampling Control", "Isolation Control", "Sequencing Control"))
pca_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#FB9986", #vivid tangerine - tongue
"#0BDBA7", #sea foam green - sampling ctrls
"#F2C121", #- isolation ctrls
"#666B6E" #- sequencing ctrls
)
# plot PCA
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") +
geom_point(show.legend = FALSE, aes(color = PCA_Groups), size = 2) +
# segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=PCA_Groups, color=PCA_Groups), size=3, show.legend = FALSE) + # centroid information
theme_classic() +
theme(legend.title = element_blank(),
axis.text.y = element_text(size=15),
axis.text.x = element_text(size=15),
axis.title.y = element_text(size=12, face="bold", vjust = 0),
axis.title.x = element_text(size=12, face="bold", vjust = 0),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
) +
scale_color_manual(values = pca_pal) +
scale_x_continuous(limits = c(-0.75, 0.75), breaks = c(-0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75)) +
scale_y_continuous(limits = c(-0.75, 0.75), breaks = c(-0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75)) +
labs(
x = "\nPrincipal Component 1 (11.6% explained)",
y = "Principal Component 2 (6.3% explained)\n"
)
#Rarefaction curves can also be used to explore α-diversity. First, the data needs to be prepared.
otu_raw_rare <- otu_raw
rownames(otu_raw_rare) <- str_remove(string = rownames(otu_raw_rare), pattern = "_S\\d+_L001_R1_001") %>% str_remove(pattern = "_S\\d+_L001_R1")
otu_raw_rare <- dplyr::select(otu_raw_rare, -label, -numOtus)
tvl.rownames <- vars_select(rownames(otu_raw_rare), !contains("Mock")) %>% as.data.frame()
colnames(tvl.rownames) <- "Sample"
div.rare.bl <- rownames_to_column(otu_raw_rare, var = "Sample")
#filter out low read sample
div.rare.bl <- left_join(tvl.rownames, div.rare.bl) %>% column_to_rownames(var = "Sample")
# Generate the rarefaction data, combine with metadata, transform for plotting, and aggregate
rarefy_agg <- rarefy(div.rare.bl, sample = 1000) %>% # subsample
as.data.frame() %>%
rownames_to_column("Sample")
colnames(rarefy_agg) <- c("Sample", "Unique_Otus_per_1k_reads")
rarefy_agg <- left_join(rarefy_agg, dplyr::select(otu_df, Sample, Organ, Sample_Type2), by = "Sample")
rarefy_agg <- dplyr::filter(rarefy_agg, !is.na(Organ))
rarefy_agg_summary <- rarefy_agg %>%
group_by(Sample_Type2) %>%
summarize(Mean_Species = mean(Unique_Otus_per_1k_reads),
SEM = sqrt(var(Unique_Otus_per_1k_reads)/length(Unique_Otus_per_1k_reads)))
rarefy_agg$Sample_Type2 <- factor(rarefy_agg$Sample_Type2,
levels = c("Empty", "Water", "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung", "Tongue", "Cecum"))
rarefy_agg_summary$Sample_Type2 <- factor(rarefy_agg_summary$Sample_Type2,
levels = c("Empty", "Water", "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung", "Tongue", "Cecum"))
rich_pal <- c( "#EB3D0E", #scarlet - lung
"#0300A6", # blue - BAL
"#FB9986", #vivid tangerine - tongue
"#0BDBA7", #sea foam green - sampling ctrls
"#F2C121", #- isolation ctrls
"#666B6E" #- sequencing ctrls
)
ggplot(rarefy_agg_summary, aes(x = Sample_Type2, y = Mean_Species, fill = Sample_Type2)) + # This brings in the aggregated dataframe and assigns columns to various plot elements
geom_col(color="black", show.legend = FALSE) + # Add bars representing the mean of specimens' diversity
geom_point(data = rarefy_agg, aes(x = Sample_Type2, y = Unique_Otus_per_1k_reads ), width = .2, show.legend = FALSE) + # Use specimen level data, map columns to scatterplot attributes
geom_errorbar(aes(ymin = Mean_Species - SEM, ymax = Mean_Species + SEM), width = 0.5, size=0.7) + # Add an errorbar layer
theme_classic() + # A minimalist theme
theme(axis.title.y = element_text(face = "bold", size = 12),
axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(face = "bold", size = 12),
plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) +
scale_y_continuous(limits = c(0,150), breaks = seq(0,150,25), expand = c(0,0)) +
labs(y = "Richness\n(No. of Unique OTUs/1000 reads)\n", x = "\nSample Type")
otu_df <- mutate(otu_df, Alpha_Div_Groups = ifelse(test = Organ != "Lung",
yes = as.character(Organ),
no = as.character(Sample_Type))) %>%
dplyr::select(Sample:Gene_16S_copies_per_mL, Alpha_Div_Groups, everything())
otu_df <- left_join(otu_df, dplyr::select(rarefy_agg, Sample, Unique_Otus_per_1k_reads)) %>%
dplyr::select(Sample:Gene_16S_copies_per_mL, Unique_Otus_per_1k_reads, everything())
tukey_otu_df <- filter(otu_df, Sample_Type!="Mock" & Sample_Type!="Cecum" & Sample_Type!="Tongue")
TukeyHSD(aov(tukey_otu_df[,"Unique_Otus_per_1k_reads"] ~ tukey_otu_df[, "Alpha_Div_Groups"]))
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = tukey_otu_df[, "Unique_Otus_per_1k_reads"] ~ tukey_otu_df[, "Alpha_Div_Groups"])
$`tukey_otu_df[, "Alpha_Div_Groups"]`
diff lwr upr
Isolation Control-BAL -10.612291 -27.006526 5.7819429
Lung-BAL 25.049970 7.457534 42.6424053
Sampling Control-BAL -2.528455 -20.690372 15.6334619
Sequencing Control-BAL -13.515401 -27.202081 0.1712778
Lung-Isolation Control 35.662261 18.778539 52.5459833
Sampling Control-Isolation Control 8.083837 -9.392476 25.5601491
Sequencing Control-Isolation Control -2.903110 -15.666004 9.8597843
Sampling Control-Lung -27.578425 -46.183380 -8.9734692
Sequencing Control-Lung -38.565371 -52.834721 -24.2960216
Sequencing Control-Sampling Control -10.986946 -25.952766 3.9788736
p adj
Isolation Control-BAL 0.3746804
Lung-BAL 0.0014826
Sampling Control-BAL 0.9950044
Sequencing Control-BAL 0.0545862
Lung-Isolation Control 0.0000011
Sampling Control-Isolation Control 0.6949822
Sequencing Control-Isolation Control 0.9684857
Sampling Control-Lung 0.0008519
Sequencing Control-Lung 0.0000000
Sequencing Control-Sampling Control 0.2510366
#calculate Shannon diversity indices for all samples
div.shan <- diversity(otu_good, "shannon") %>% data.frame(Shannon = .) %>% rownames_to_column("Sample") %>% left_join(otu_df[,1:6], by = "Sample")
otu_df <- mutate(otu_df, Shannon = diversity(otu_good, "shannon")) %>% dplyr::select(Sample:Alpha_Div_Groups, Shannon, everything())
# Create summary statistics
agg_shan_all <- div.shan %>%
group_by(Sample_Type2) %>%
summarize(Mean_Sh = mean(Shannon), SEM_Sh = sqrt(var(Shannon)/length(Shannon)))
agg_shan_all$Sample_Type2 <- factor(agg_shan_all$Sample_Type2,
levels = c("Mock", "Empty", "Water", "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung", "Tongue", "Cecum"))
agg_shan_all <- filter(agg_shan_all, Sample_Type2 != "Mock")
div.shan$Sample_Type2 <- factor(div.shan$Sample_Type2, levels = c("Mock", "Empty", "Water", "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung", "Tongue", "Cecum"))
div.shan <- filter(div.shan, Sample_Type2 != "Mock")
ggplot(agg_shan_all, aes(x = Sample_Type2, y = Mean_Sh, fill = Sample_Type2)) + # This brings in the aggregated dataframe and assigns columns to various plot elements
geom_col(color="black", show.legend = FALSE) + # Add bars representing the mean of specimens' Shannon diversity by vendor
geom_point(data = div.shan, aes(x = Sample_Type2, y = Shannon), width = .2, show.legend = FALSE) + # Use specimen level data, map columns to scatterplot attributes
geom_errorbar(aes(ymin = Mean_Sh - SEM_Sh, ymax = Mean_Sh + SEM_Sh), width = 0.5, size=0.7) + # Add an errorbar layer
theme_classic() + # A minimalist theme
theme(axis.title.y = element_text(face = "bold", size = 15),
axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(face = "bold", size = 15),
plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) +
scale_y_continuous(limits = c(-0.01,4.5), breaks = seq(0,4,1), expand = c(0,0)) +
labs(y = "Shannon Diversity Index\n", x = "\nSample Type")
tukey_otu_df <- dplyr::filter(otu_df, Sample_Type != "Mock" & Sample_Type != "Cecum" & Sample_Type != "Tongue")
TukeyHSD(aov(tukey_otu_df[,"Shannon"] ~ tukey_otu_df[, "Alpha_Div_Groups"]))
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = tukey_otu_df[, "Shannon"] ~ tukey_otu_df[, "Alpha_Div_Groups"])
$`tukey_otu_df[, "Alpha_Div_Groups"]`
diff lwr upr p adj
Isolation Control-BAL -0.9947478 -1.8514868 -0.1380087 0.0147639
Lung-BAL 0.6346623 -0.2846930 1.5540176 0.3098538
Sampling Control-BAL -0.5859416 -1.5350572 0.3631740 0.4234102
Sequencing Control-BAL -1.2226705 -1.9379166 -0.5074244 0.0000876
Lung-Isolation Control 1.6294101 0.7470911 2.5117290 0.0000205
Sampling Control-Isolation Control 0.4088062 -0.5044807 1.3220931 0.7201786
Sequencing Control-Isolation Control -0.2279227 -0.8948932 0.4390477 0.8732743
Sampling Control-Lung -1.2206039 -2.1928720 -0.2483357 0.0067338
Sequencing Control-Lung -1.8573328 -2.6030284 -1.1116372 0.0000000
Sequencing Control-Sampling Control -0.6367289 -1.4188211 0.1453633 0.1637833
bray_df_lbtce <- dplyr::filter(otu_df, Sample_Type2 == "Lung" | Sample_Type2 == "Tongue" | Sample_Type2 == "Cecum" | Sample_Type2 == "BAL" | Sample_Type2 == "Empty") %>%
droplevels() %>%
column_to_rownames("Sample") %>% # This turns the specimen column into the rownames.
dplyr::select(-c(Sample_Type:Alpha_Div_Groups, Shannon))
#identify all unique comparisons within sample types
sample_df <- rownames(bray_df_lbtce) %>% as.data.frame()
colnames(sample_df) <- c("Sample")
sample_df <- mutate(sample_df, Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue|Cecum|Blank")))
dim(sample_df) #[1] 87 2
comb_all <- combinations(n = 87, r = 2, v = sample_df$Sample) %>% as.data.frame()
colnames(comb_all) <- c("Sample", "Comparison")
comb_all_mut <- mutate(comb_all, Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue|Cecum|Blank")),
Comparison_Type = factor(str_extract(Comparison, pattern="Lung|BAL|Tongue|Cecum|Blank")))
comb_uniq <- dplyr::filter(comb_all_mut, (Sample_Type == Comparison_Type))
#calculate distance, convert to symmetric matrix, and calculate row means (one mean per sample)
bray_dist_lbtce <- vegdist(bray_df_lbtce, method = "bray")
bray_dist_lbtce <- as.matrix(bray_dist_lbtce) %>% as.data.frame() %>% rownames_to_column(var="Sample")
bray_dist_lbtce <- dplyr::select(bray_dist_lbtce, Sample, everything())
bray_dist_lbtce_long <- pivot_longer(bray_dist_lbtce, -Sample, names_to = "Comparison", values_to = "BC_Index")
bray_dist_lbtce_long <- dplyr::filter(bray_dist_lbtce_long, Sample != Comparison)
bray_dist_lbtce_long_mut_filt_uniq <- left_join(comb_uniq, bray_dist_lbtce_long)
bray_dist_lbtce_summary <- group_by(bray_dist_lbtce_long_mut_filt_uniq, Sample) %>%
summarize(Mean_BC = mean(BC_Index), SEM = sem(BC_Index))
bray_dist_lbtce_summary <- ungroup(bray_dist_lbtce_summary) %>%
mutate(Sample_Type = factor(str_extract(Sample,pattern="Lung|BAL|Tongue|Cecum|Blank")))
bray_dist_lbtce_summary$Sample_Type <- factor(bray_dist_lbtce_summary$Sample_Type,
levels = c("Blank", "BAL","Lung", "Tongue", "Cecum"))
ggplot(bray_dist_lbtce_summary, aes(x=Sample_Type, y = Mean_BC)) +
geom_boxplot(aes(fill=Sample_Type), show.legend = FALSE) +
geom_point() +
scale_y_continuous(limits = c(0,1.01), expand = c(0,0)) +
theme_classic() +
theme(axis.title.y = element_text(face = "bold", size = 15),
axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(face = "bold", size = 15),
plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) +
labs(y = "Bray-Curtis Dissimilarity\n", x = "\nSample Type") +
scale_fill_manual(values=c("#DBE6EC","#0300A6", "#EB3D0E", "#0BDBA7", "#0BDBA7"))
[1] 87 2
pairwise.wilcox.test(bray_dist_lbtce_long_mut_filt_uniq$BC_Index,
bray_dist_lbtce_long_mut_filt_uniq$Sample_Type,
p.adjust.method = "BH")
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: bray_dist_lbtce_long_mut_filt_uniq$BC_Index and bray_dist_lbtce_long_mut_filt_uniq$Sample_Type
BAL Blank Cecum Lung
Blank 0.2730 - - -
Cecum < 2e-16 < 2e-16 - -
Lung 6.3e-09 7.4e-06 < 2e-16 -
Tongue < 2e-16 8.5e-13 < 2e-16 0.0053
P value adjustment method: BH
bray_df_tvbl <- dplyr::filter(otu_df, Organ == "Lung" | Organ == "Tongue") %>%
droplevels() %>%
column_to_rownames("Sample") %>% # This turns the specimen column into the rownames.
dplyr::select(-c(Sample_Type:Shannon))
#calculate distance, convert to symmetric matrix, and calculate row means (one mean per sample)
bray_dist_tvbl <- vegdist(bray_df_tvbl, method = "bray")
bray_dist_tvbl <- as.matrix(bray_dist_tvbl) %>% as.data.frame() %>% rownames_to_column(var="Sample")
bray_dist_tvbl_long <- pivot_longer(bray_dist_tvbl, -Sample, names_to = "Comparison", values_to = "BC_Index")
bray_dist_tvbl_long <- dplyr::filter(bray_dist_tvbl_long, Sample != Comparison)
bray_dist_tvbl_long_mut <- dplyr::mutate(bray_dist_tvbl_long,
Sample_Mouse = factor(str_extract(Sample, pattern="L\\d+$|B\\d+$")),
Comparison_Mouse = factor(str_extract(Comparison, pattern="L\\d+$|B\\d+$")),
Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue")),
Comparison_Type = factor(str_extract(Comparison, pattern="Lung|BAL|Tongue")))
bray_dist_tvbl_long_mut_filt <- dplyr::filter(bray_dist_tvbl_long_mut, (Sample_Type != Comparison_Type) & (Sample_Mouse == Comparison_Mouse) & Sample_Type == "Tongue")
ggplot(bray_dist_tvbl_long_mut_filt, aes(x = Comparison_Type, y = BC_Index)) +
geom_boxplot(aes(fill=Comparison_Type), show.legend = FALSE) +
geom_point() +
scale_y_continuous(limits = c(0,1.1), expand = c(0,0)) +
theme_classic() +
theme(axis.title.y = element_text(face = "bold", size = 15),
axis.text.y = element_text(size = 15),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(face = "bold", size = 15),
plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) +
ylab("Bray-Curtis Dissimilarity\nto Matched Tongue Sample\n") +
scale_fill_manual(values=c( "#0300A6", "#EB3D0E"))
#Mann-Whitney U test
wilcox.test(BC_Index ~ Comparison_Type, data = bray_dist_tvbl_long_mut_filt)
Wilcoxon rank sum exact test
data: BC_Index by Comparison_Type
W = 85, p-value = 0.0004114
alternative hypothesis: true location shift is not equal to 0
otu_df <- dplyr::mutate(otu_df, RA_Groups = ifelse(test = is.na(RA_Groups),
yes = as.character(Sample_Type),
no = as.character(RA_Groups)
)) %>%
dplyr::select(Sample:Shannon, RA_Groups, everything())
otu_df$RA_Groups <- as.factor(otu_df$RA_Groups)
gg.bal.lung.neg.tong.RAG.ordL <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>%
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Lung")
gg.bal.lung.neg.tong.RAG.ordL$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordL$RA_Groups, levels = c("Lung","Tongue", "BAL", "Negative Control"), labels = c("Lung","Tongue", "BAL", "Negative\nControls"))
plot_ra(gg.bal.lung.neg.tong.RAG.ordL, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
theme_bw() +
geom_col(color="black", aes(fill=RA_Groups)) +
theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1),
axis.text.y = element_text(size = 22),
axis.title.y = element_text(face = "bold", size = 30),
axis.title.x = element_text(),
strip.text = element_text(face = "bold", size = 25, angle = 0),
strip.background = element_blank(),
panel.grid = element_blank(),
legend.position = "none",
panel.spacing = unit(2, "lines"),
plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) +
scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) +
labs(y = "Relative Abundance (%)\n", x = "\nOTU") +
scale_fill_manual(values=c("#EB3D0E", "#FB9986", "#0300A6", "#666B6E"))
gg.bal.lung.neg.tong.RAG.ordN <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>%
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Negative Control")
gg.bal.lung.neg.tong.RAG.ordN$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordN$RA_Groups, levels = c("Negative Control", "BAL", "Lung", "Tongue"), labels = c("Negative\nControls", "BAL", "Lung", "Tongue"))
plot_ra(gg.bal.lung.neg.tong.RAG.ordN, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
theme_bw() +
geom_col(color="black", aes(fill=RA_Groups)) +
theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1),
axis.text.y = element_text(size = 22),
axis.title.y = element_text(face = "bold", size = 30),
axis.title.x = element_text(),
strip.text = element_text(face = "bold", size = 25, angle = 0),
strip.background = element_blank(),
panel.grid = element_blank(),
legend.position = "none",
panel.spacing = unit(2, "lines"),
plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) +
scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) +
labs(y = "Relative Abundance (%)\n", x = "\nOTU") +
scale_fill_manual(values=c("#666B6E", "#0300A6", "#EB3D0E", "#FB9986"))
gg.bal.lung.neg.tong.RAG.ordT <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>%
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Tongue")
gg.bal.lung.neg.tong.RAG.ordT$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordT$RA_Groups, levels = c("Tongue", "Lung","BAL", "Negative Control"), labels = c("Tongue", "Lung","BAL", "Negative\nControls"))
plot_ra(gg.bal.lung.neg.tong.RAG.ordT, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
theme_bw() +
geom_col(color="black", aes(fill=RA_Groups)) +
theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1),
axis.text.y = element_text(size = 22),
axis.title.y = element_text(face = "bold", size = 30),
axis.title.x = element_text(),
strip.text = element_text(face = "bold", size = 25, angle = 0),
strip.background = element_blank(),
panel.grid = element_blank(),
legend.position = "none",
panel.spacing = unit(2, "lines"),
plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) +
scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) +
labs(y = "Relative Abundance (%)\n", x = "\nOTU") +
scale_fill_manual(values=c("#FB9986", "#EB3D0E", "#0300A6", "#666B6E"))